{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Master notebook for data preparation."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd \n",
    "import numpy as np\n",
    "import codecs\n",
    "from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\n",
    "from sklearn.metrics.pairwise import cosine_similarity\n",
    "\n",
    "##Additional libraries: need to install protobuf on Anaconda prompt with command \"conda install -c anaconda protobuf\"\n",
    "\n",
    "from transformers import AutoTokenizer, AutoModelForTokenClassification\n",
    "tokenizer = AutoTokenizer.from_pretrained(\"gilf/french-camembert-postag-model\")\n",
    "model = AutoModelForTokenClassification.from_pretrained(\"gilf/french-camembert-postag-model\")\n",
    "from transformers import pipeline\n",
    "nlp_token_class = pipeline('ner', model=model, tokenizer=tokenizer, grouped_entities=True)\n",
    "\n",
    "##Core directory - TO BE CHANGED BY USER\n",
    "dir = 'C:/Users/80002088/Dropbox/Incumbency_Advantage_France/replication'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Merge manifestos with candidate ID and pre-process text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1962\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\80002088\\Anaconda3\\lib\\site-packages\\pandas\\io\\stata.py:1433: UnicodeWarning: \n",
      "One or more strings in the dta file could not be decoded using utf-8, and\n",
      "so the fallback encoding of latin-1 is being used.  This can happen when a file\n",
      "has been incorrectly encoded by Stata or some other software. You should verify\n",
      "the string values returned are correct.\n",
      "  warnings.warn(msg, UnicodeWarning)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1967\n",
      "1968\n",
      "1973\n",
      "1978\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\80002088\\Anaconda3\\lib\\site-packages\\pandas\\io\\stata.py:1433: UnicodeWarning: \n",
      "One or more strings in the dta file could not be decoded using utf-8, and\n",
      "so the fallback encoding of latin-1 is being used.  This can happen when a file\n",
      "has been incorrectly encoded by Stata or some other software. You should verify\n",
      "the string values returned are correct.\n",
      "  warnings.warn(msg, UnicodeWarning)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1981\n",
      "1993\n",
      "1997\n",
      "2017\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\80002088\\Anaconda3\\lib\\site-packages\\pandas\\io\\stata.py:1433: UnicodeWarning: \n",
      "One or more strings in the dta file could not be decoded using utf-8, and\n",
      "so the fallback encoding of latin-1 is being used.  This can happen when a file\n",
      "has been incorrectly encoded by Stata or some other software. You should verify\n",
      "the string values returned are correct.\n",
      "  warnings.warn(msg, UnicodeWarning)\n"
     ]
    }
   ],
   "source": [
    "%run notebooks/Prep_manifestos.ipynb"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Construct measures of candidate similarity to other candidates in same party"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1962\n",
      "1967\n",
      "1968\n",
      "1973\n",
      "1978\n",
      "1981\n",
      "1993\n",
      "1997\n",
      "2017\n",
      "1962\n",
      "1967\n",
      "1968\n",
      "1973\n",
      "1978\n",
      "1981\n",
      "1993\n",
      "1997\n",
      "2017\n",
      "1962\n",
      "1967\n",
      "1968\n",
      "1973\n",
      "1978\n",
      "1981\n",
      "1993\n",
      "1997\n",
      "2017\n",
      "1962\n",
      "1967\n",
      "1968\n",
      "1973\n",
      "1978\n",
      "1981\n",
      "1993\n",
      "1997\n",
      "2017\n"
     ]
    }
   ],
   "source": [
    "%run notebooks/Originality.ipynb"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "## Personal pronouns and past participles"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1962\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\80002088\\Anaconda3\\lib\\site-packages\\pandas\\io\\stata.py:1433: UnicodeWarning: \n",
      "One or more strings in the dta file could not be decoded using utf-8, and\n",
      "so the fallback encoding of latin-1 is being used.  This can happen when a file\n",
      "has been incorrectly encoded by Stata or some other software. You should verify\n",
      "the string values returned are correct.\n",
      "  warnings.warn(msg, UnicodeWarning)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1967\n",
      "1968\n",
      "1973\n",
      "1978\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\80002088\\Anaconda3\\lib\\site-packages\\pandas\\io\\stata.py:1433: UnicodeWarning: \n",
      "One or more strings in the dta file could not be decoded using utf-8, and\n",
      "so the fallback encoding of latin-1 is being used.  This can happen when a file\n",
      "has been incorrectly encoded by Stata or some other software. You should verify\n",
      "the string values returned are correct.\n",
      "  warnings.warn(msg, UnicodeWarning)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1981\n",
      "1993\n",
      "1997\n",
      "2017\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\80002088\\Anaconda3\\lib\\site-packages\\pandas\\io\\stata.py:1433: UnicodeWarning: \n",
      "One or more strings in the dta file could not be decoded using utf-8, and\n",
      "so the fallback encoding of latin-1 is being used.  This can happen when a file\n",
      "has been incorrectly encoded by Stata or some other software. You should verify\n",
      "the string values returned are correct.\n",
      "  warnings.warn(msg, UnicodeWarning)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1962\n",
      "Mean share of perso:\n",
      "0.0035052799556196715\n",
      "1967\n",
      "Mean share of perso:\n",
      "0.0034491053543104014\n",
      "1968\n",
      "Mean share of perso:\n",
      "0.00233819632857973\n",
      "1973\n",
      "Mean share of perso:\n",
      "0.0029684882687727615\n",
      "1978\n",
      "Mean share of perso:\n",
      "0.003548007030065978\n",
      "1981\n",
      "Mean share of perso:\n",
      "0.0042861597251776795\n",
      "1993\n",
      "Mean share of perso:\n",
      "0.003998451616759024\n",
      "1997\n",
      "Mean share of perso:\n",
      "0.003226613108515449\n",
      "2017\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Mean share of perso:\n",
      "0.005888817063271645\n",
      "1962\n",
      "0\n",
      "100\n",
      "200\n",
      "300\n",
      "400\n",
      "500\n",
      "600\n",
      "700\n",
      "800\n",
      "900\n",
      "1000\n",
      "1100\n",
      "1200\n",
      "1300\n",
      "1400\n",
      "1500\n",
      "1600\n",
      "1967\n",
      "0\n",
      "100\n",
      "200\n",
      "300\n",
      "400\n",
      "500\n",
      "600\n",
      "700\n",
      "800\n",
      "900\n",
      "1000\n",
      "1100\n",
      "1200\n",
      "1300\n",
      "1400\n",
      "1500\n",
      "1600\n",
      "1700\n",
      "1800\n",
      "1900\n",
      "2000\n",
      "1968\n",
      "0\n",
      "100\n",
      "200\n",
      "300\n",
      "400\n",
      "500\n",
      "600\n",
      "700\n",
      "800\n",
      "900\n",
      "1000\n",
      "1100\n",
      "1200\n",
      "1300\n",
      "1400\n",
      "1500\n",
      "1600\n",
      "1700\n",
      "1800\n",
      "1900\n",
      "2000\n",
      "2100\n",
      "2200\n",
      "1973\n",
      "0\n",
      "100\n",
      "200\n",
      "300\n",
      "400\n",
      "500\n",
      "600\n",
      "700\n",
      "800\n",
      "900\n",
      "1000\n",
      "1100\n",
      "1200\n",
      "1300\n",
      "1400\n",
      "1500\n",
      "1600\n",
      "1700\n",
      "1800\n",
      "1900\n",
      "2000\n",
      "2100\n",
      "2200\n",
      "2300\n",
      "2400\n",
      "2500\n",
      "2600\n",
      "2700\n",
      "2800\n",
      "2900\n",
      "1978\n",
      "0\n",
      "100\n",
      "200\n",
      "300\n",
      "400\n",
      "500\n",
      "600\n",
      "700\n",
      "800\n",
      "900\n",
      "1000\n",
      "1100\n",
      "1200\n",
      "1300\n",
      "1400\n",
      "1500\n",
      "1600\n",
      "1700\n",
      "1800\n",
      "1900\n",
      "2000\n",
      "2100\n",
      "2200\n",
      "2300\n",
      "2400\n",
      "2500\n",
      "2600\n",
      "2700\n",
      "2800\n",
      "2900\n",
      "3000\n",
      "3100\n",
      "3200\n",
      "3300\n",
      "3400\n",
      "3500\n",
      "3600\n",
      "3700\n",
      "3800\n",
      "3900\n",
      "1981\n",
      "0\n",
      "100\n",
      "200\n",
      "300\n",
      "400\n",
      "500\n",
      "600\n",
      "700\n",
      "800\n",
      "900\n",
      "1000\n",
      "1100\n",
      "1200\n",
      "1300\n",
      "1400\n",
      "1500\n",
      "1600\n",
      "1700\n",
      "1800\n",
      "1900\n",
      "2000\n",
      "2100\n",
      "2200\n",
      "2300\n",
      "2400\n",
      "1993\n",
      "0\n",
      "100\n",
      "200\n",
      "300\n",
      "400\n",
      "500\n",
      "600\n",
      "700\n",
      "800\n",
      "900\n",
      "1000\n",
      "1100\n",
      "1200\n",
      "1300\n",
      "1400\n",
      "1500\n",
      "1600\n",
      "1700\n",
      "1800\n",
      "1900\n",
      "2000\n",
      "2100\n",
      "2200\n",
      "2300\n",
      "2400\n",
      "2500\n",
      "2600\n",
      "2700\n",
      "2800\n",
      "2900\n",
      "3000\n",
      "3100\n",
      "3200\n",
      "3300\n",
      "3400\n",
      "3500\n",
      "3600\n",
      "3700\n",
      "3800\n",
      "3900\n",
      "4000\n",
      "4100\n",
      "4200\n",
      "4300\n",
      "4400\n",
      "4500\n",
      "4600\n",
      "4700\n",
      "4800\n",
      "1997\n",
      "0\n",
      "100\n",
      "200\n",
      "300\n",
      "400\n",
      "500\n",
      "600\n",
      "700\n",
      "800\n",
      "900\n",
      "1000\n",
      "1100\n",
      "1200\n",
      "1300\n",
      "1400\n",
      "1500\n",
      "1600\n",
      "1700\n",
      "1800\n",
      "1900\n",
      "2000\n",
      "2100\n",
      "2200\n",
      "2300\n",
      "2400\n",
      "2500\n",
      "2600\n",
      "2700\n",
      "2800\n",
      "2900\n",
      "3000\n",
      "3100\n",
      "3200\n",
      "3300\n",
      "3400\n",
      "3500\n",
      "3600\n",
      "3700\n",
      "3800\n",
      "3900\n",
      "4000\n",
      "4100\n",
      "4200\n",
      "4300\n",
      "4400\n",
      "4500\n",
      "4600\n",
      "4700\n",
      "4800\n",
      "4900\n",
      "5000\n",
      "5100\n",
      "5200\n",
      "5300\n",
      "5400\n",
      "2017\n",
      "0\n",
      "100\n",
      "200\n",
      "300\n",
      "400\n",
      "500\n",
      "600\n",
      "700\n",
      "800\n",
      "900\n",
      "1000\n",
      "1100\n",
      "1200\n",
      "1300\n",
      "1400\n",
      "1500\n",
      "1600\n",
      "1700\n",
      "1800\n",
      "1900\n",
      "2000\n",
      "2100\n",
      "2200\n",
      "2300\n",
      "2400\n",
      "2500\n",
      "2600\n",
      "2700\n",
      "2800\n",
      "2900\n",
      "3000\n",
      "3100\n",
      "3200\n",
      "3300\n",
      "3400\n",
      "3500\n",
      "3600\n",
      "3700\n",
      "3800\n",
      "3900\n",
      "4000\n",
      "4100\n",
      "4200\n",
      "4300\n",
      "4400\n",
      "4500\n",
      "4600\n",
      "4700\n",
      "4800\n",
      "4900\n"
     ]
    }
   ],
   "source": [
    "%run notebooks/Find_words.ipynb"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
